In [63]:
import pandas as pd
def preview(df):
print("Dimensions: {0} rows x {1} columns".format(df.shape[0], df.shape[1]))
return df.head()
jeopardy = pd.read_csv("jeopardy.csv")
preview(jeopardy)
Out[63]:
In [64]:
print(jeopardy.columns)
# Rename columns
jeopardy.rename(columns=lambda name: name.lstrip(), inplace=True)
In [65]:
print(jeopardy.columns)
In [66]:
import string
def norm_words(words):
words = words.lower().translate(None, string.punctuation)
return words
jeopardy["clean_question"] = jeopardy["Question"].apply(norm_words)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(norm_words)
jeopardy.head()
Out[66]:
In [67]:
def norm_value(value):
try:
value = int(value.translate(None, string.punctuation))
except:
value = 0
return value
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_value)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
print(jeopardy.dtypes)
jeopardy.head()
Out[67]:
In [73]:
def ans_in_q(row):
match_count = 0
split_answer = row["clean_answer"].split(" ")
split_question = row["clean_question"].split(" ")
try:
split_answer.remove("the")
except:
pass
if len(split_answer) == 0:
return 0
else:
for word in split_answer:
if word in split_question:
match_count += 1
return match_count / len(split_answer)
jeopardy["answer_in_question"] = jeopardy.apply(ans_in_q, axis=1)
print(jeopardy["answer_in_question"].mean())
jeopardy[jeopardy["answer_in_question"] > 0].head()
Out[73]:
In [85]:
jeopardy[(jeopardy["answer_in_question"] > 0) & (jeopardy["clean_question"].apply(string.split).apply(len) > 6)].head()
Out[85]:
Only 0.6% of the answers appear in the questions itself. Out of this 0.6%, a sample of the questions shows that they are all multiple choice questions, which concludes that it is very unlikely that the answer will be in the question itself.
In [102]:
jeopardy = jeopardy.sort_values(by="Air Date")
question_overlap = []
terms_used = set()
for index, row in jeopardy.iterrows():
match_count = 0
split_question = row["clean_question"].split(" ")
for word in split_question:
if len(word) < 6:
split_question.remove(word)
for word in split_question:
if word in terms_used:
match_count += 1
terms_used.add(word)
if len(split_question) > 0:
match_count /= float(len(split_question))
question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())
In [105]:
jeopardy.tail()
Out[105]:
In [106]:
def value(row):
if row["clean_value"] > 800:
value = 1
else:
value = 0
return value
jeopardy["high_value"] = jeopardy.apply(value, axis=1)
jeopardy.head()
Out[106]:
In [ ]: